import numpy as np
import pandas as pd
import warnings

# FIX: the original line was a bare `warnings.filterwarnings` — an attribute
# reference that never calls the function, so no filter was ever installed.
# Later cells call warnings.filterwarnings('ignore'); do the same here.
warnings.filterwarnings('ignore')

# Concrete compressive-strength dataset: 1030 rows, 9 numeric columns,
# target column 'csMPa' (see demo.info() output below).
demo = pd.read_csv("Concrete_Data_Yeh.csv")
demo.head()
| cement | slag | flyash | water | superplasticizer | coarseaggregate | fineaggregate | age | csMPa | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 540.0 | 0.0 | 0.0 | 162.0 | 2.5 | 1040.0 | 676.0 | 28 | 79.99 |
| 1 | 540.0 | 0.0 | 0.0 | 162.0 | 2.5 | 1055.0 | 676.0 | 28 | 61.89 |
| 2 | 332.5 | 142.5 | 0.0 | 228.0 | 0.0 | 932.0 | 594.0 | 270 | 40.27 |
| 3 | 332.5 | 142.5 | 0.0 | 228.0 | 0.0 | 932.0 | 594.0 | 365 | 41.05 |
| 4 | 198.6 | 132.4 | 0.0 | 192.0 | 0.0 | 978.4 | 825.5 | 360 | 44.30 |
# Dtypes and non-null counts: all 9 columns are numeric with no missing values.
demo.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1030 entries, 0 to 1029 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 cement 1030 non-null float64 1 slag 1030 non-null float64 2 flyash 1030 non-null float64 3 water 1030 non-null float64 4 superplasticizer 1030 non-null float64 5 coarseaggregate 1030 non-null float64 6 fineaggregate 1030 non-null float64 7 age 1030 non-null int64 8 csMPa 1030 non-null float64 dtypes: float64(8), int64(1) memory usage: 72.5 KB
from matplotlib import pyplot as plt
import seaborn as sns
# Pairwise scatter plots of all columns, KDE curves on the diagonal.
sns.pairplot(demo,diag_kind='kde')
<seaborn.axisgrid.PairGrid at 0x221b433bac0>
# Standardize every column (zero mean, unit variance) and redraw the pair
# plot — scaling moves the axes but leaves the distribution shapes unchanged.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
scaleddemo = pd.DataFrame(sc.fit_transform(demo), columns=demo.columns)

sns.pairplot(scaleddemo, diag_kind='kde')
plt.show()
from sklearn.preprocessing import PowerTransformer

# Power-transform the heavily skewed 'age' column and plot its distribution.
pt = PowerTransformer()
transformedage = pt.fit_transform(demo[['age']])
#sns.distplot(demo['age'])
# FIX: sns.distplot is deprecated (seaborn emitted a FutureWarning here);
# histplot with a KDE overlay and density scaling is the supported equivalent.
sns.histplot(transformedage, kde=True, stat='density')
plt.show()
C:\Users\MAYUR\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
# Pair plot after power-transforming every column. NOTE(review): wrapping the
# array in a fresh DataFrame drops the original column names from the axes.
sns.pairplot(pd.DataFrame(pt.fit_transform(demo)),diag_kind='kde')
<seaborn.axisgrid.PairGrid at 0x221bafeb070>
# Pearson correlation matrix of all columns, annotated with the coefficients.
sns.heatmap(demo.corr(),annot=True)
<AxesSubplot:>
import pandas as pd
from pandas_profiling import ProfileReport
# Automated EDA report for the whole dataset, saved as HTML.
design_report = ProfileReport(demo)
design_report.to_file(output_file='report.html')
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
Export report to file: 0%| | 0/1 [00:00<?, ?it/s]
from autoviz.AutoViz_Class import AutoViz_Class
# Automated visualization of the raw CSV (no dependent variable specified).
AV = AutoViz_Class()
df = AV.AutoViz('Concrete_Data_Yeh.csv')
Alert! from version 0.1.42, after importing, you must do '%matplotlib inline' to display charts in Jupyter Notebooks.
AV = AutoViz_Class()
# FIX: the original call passed an undefined variable `filename` (NameError);
# point AutoViz at the dataset file explicitly, keeping all other arguments.
dfte = AV.AutoViz('Concrete_Data_Yeh.csv', sep=',', depVar='', dfte=None, header=0,
                  verbose=0, lowess=False, chart_format='svg',
                  max_rows_analyzed=150000, max_cols_analyzed=30, save_plot_dir=None)
Note: verbose=0 or 1 generates charts and displays them in your local Jupyter notebook.
verbose=2 does not display plots but saves them in AutoViz_Plots folder in local machine.
Updated: chart_format='bokeh' generates and displays charts in your local Jupyter notebook.
chart_format='server' generates and displays charts in the browser - one tab for each chart.
chart_format='html' silently saves charts HTML format - they are also interactive!
Shape of your Data Set loaded: (1030, 9)
#######################################################################################
######################## C L A S S I F Y I N G V A R I A B L E S ####################
#######################################################################################
Classifying variables in data set...
9 Predictors classified...
No variables removed since no ID or low-information variables found in data set
Number of All Scatter Plots = 36
Time to run AutoViz = 4 seconds
###################### AUTO VISUALIZATION Completed ########################
import dtale
import pandas as pd
# Launch an interactive D-Tale data grid for the dataset in the browser.
d = dtale.show(demo)
d.open_browser()
# Baseline: ordinary linear regression on the raw (unscaled) features.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

X = demo.drop('csMPa', axis=1)
#X=demo[['cement','age','water']]
y = demo[['csMPa']]
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.20, random_state=10)

lr = LinearRegression()
lr.fit(Xtrain, ytrain)
print("Training R2", lr.score(Xtrain, ytrain), sep="\n")
print("Testing R2", lr.score(Xtest, ytest), sep="\n")
Training R2 0.6210697467843866 Testing R2 0.5911879648718985
# First five rows of the predictor matrix (target column dropped).
X.head()
| cement | slag | flyash | water | superplasticizer | coarseaggregate | fineaggregate | age | |
|---|---|---|---|---|---|---|---|---|
| 0 | 540.0 | 0.0 | 0.0 | 162.0 | 2.5 | 1040.0 | 676.0 | 28 |
| 1 | 540.0 | 0.0 | 0.0 | 162.0 | 2.5 | 1055.0 | 676.0 | 28 |
| 2 | 332.5 | 142.5 | 0.0 | 228.0 | 0.0 | 932.0 | 594.0 | 270 |
| 3 | 332.5 | 142.5 | 0.0 | 228.0 | 0.0 | 932.0 | 594.0 | 365 |
| 4 | 198.6 | 132.4 | 0.0 | 192.0 | 0.0 | 978.4 | 825.5 | 360 |
# Fitted coefficients, one per predictor, in the same order as X's columns.
lr.coef_
array([[ 0.12226049, 0.10551064, 0.09218552, -0.15212926, 0.25469802,
0.02128236, 0.02197247, 0.11598932]])
# Same linear model fitted on standardized features: R2 is identical to the
# unscaled fit (scaling is a linear map), only the coefficient scale changes.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

X = demo.drop('csMPa', axis=1)
y = demo[['csMPa']]
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.20, random_state=10)

sc = StandardScaler()
scaledXtrain = sc.fit_transform(Xtrain)
scaledXtest = sc.transform(Xtest)

lr = LinearRegression()
lr.fit(scaledXtrain, ytrain)
print("Training R2", lr.score(scaledXtrain, ytrain), sep="\n")
print("Testing R2", lr.score(scaledXtest, ytest), sep="\n")
Training R2 0.6210697467843866 Testing R2 0.5911879648718986
# Coefficients on standardized features — comparable across predictors.
lr.coef_
array([[12.88715568, 9.08366371, 5.95002136, -3.29482537, 1.54318323,
1.65154057, 1.78122932, 7.35870478]])
# Degree-2 polynomial expansion, then keep at most 5 features ranked by a
# random-forest importance filter, then a linear fit on the survivors.
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer
from sklearn.feature_selection import RFE, SelectFromModel

X = demo.drop('csMPa', axis=1)
y = demo[['csMPa']]
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.20, random_state=10)

pipe = Pipeline([
    ("poly", PolynomialFeatures(degree=2)),
    ("sfm", SelectFromModel(estimator=RandomForestRegressor(), max_features=5)),
    ("lr", LinearRegression()),
])
pipe.fit(Xtrain, ytrain)
print("Training R2", pipe.score(Xtrain, ytrain), sep="\n")
print("Testing R2", pipe.score(Xtest, ytest), sep="\n")
Training R2 0.532956772745524 Testing R2 0.5193194455702451
# Scaling + linear regression as one Pipeline, scored with 10-fold CV on the
# training split.
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

X = demo.drop('csMPa', axis=1)
y = demo[['csMPa']]
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.20, random_state=10)

pipe = Pipeline([
    ("sc", StandardScaler()),
    ("lr", LinearRegression()),
])
pipe.fit(Xtrain, ytrain)
print("Training R2", pipe.score(Xtrain, ytrain), sep="\n")
print("Testing R2", pipe.score(Xtest, ytest), sep="\n")

scoresdt = cross_val_score(pipe, Xtrain, ytrain, cv=10)
print(scoresdt)
print("Average R2", np.mean(scoresdt), sep="\n")
Training R2 0.6210697467843866 Testing R2 0.5911879648718986 [0.71979944 0.55030814 0.62680426 0.35784287 0.60852661 0.64955735 0.60723933 0.74942929 0.55857665 0.56769243] Average R2 0.5995776369504165
# Coefficients of the 'lr' step inside the fitted pipeline (post-scaling).
pipe['lr'].coef_
array([[12.88715568, 9.08366371, 5.95002136, -3.29482537, 1.54318323,
1.65154057, 1.78122932, 7.35870478]])
# Power-transforming the skewed features lifts linear-regression R2 from
# ~0.62 to ~0.81 (see the printed scores of this and the baseline cell).
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

X = demo.drop('csMPa', axis=1)
y = demo[['csMPa']]
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.20, random_state=10)

pt = PowerTransformer()
powerXtrain = pt.fit_transform(Xtrain)
powerXtest = pt.transform(Xtest)

lr = LinearRegression()
lr.fit(powerXtrain, ytrain)
print("Training R2", lr.score(powerXtrain, ytrain), sep="\n")
print("Testing R2", lr.score(powerXtest, ytest), sep="\n")
Training R2 0.8080840977706969 Testing R2 0.7997646551542748
# PowerTransformer + linear regression as one Pipeline, with 10-fold CV.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA

X = demo.drop('csMPa', axis=1)
y = demo[['csMPa']]
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.20, random_state=10)

pipe = Pipeline([
    ("pt", PowerTransformer()),
    ("lr", LinearRegression()),
])
pipe.fit(Xtrain, ytrain)
print("Training R2", pipe.score(Xtrain, ytrain), sep="\n")
print("Testing R2", pipe.score(Xtest, ytest), sep="\n")

scoresdt = cross_val_score(pipe, Xtrain, ytrain, cv=10)
print(scoresdt)
print("Average R2", np.mean(scoresdt), sep="\n")
Training R2 0.8080840977706969 Testing R2 0.7997646551542748 [0.82406782 0.82312497 0.77916128 0.67511077 0.78006356 0.76090952 0.80278288 0.87637862 0.82497253 0.80589979] Average R2 0.7952471755402372
# Single decision tree, depth capped at 10: near-perfect training R2 with a
# noticeably lower CV average — classic variance gap.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor

X = demo.drop('csMPa', axis=1)
y = demo[['csMPa']]
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.20, random_state=10)

pipe = Pipeline([
    ("lr", DecisionTreeRegressor(max_depth=10)),
])
pipe.fit(Xtrain, ytrain)
print("Training R2", pipe.score(Xtrain, ytrain), sep="\n")
print("Testing R2", pipe.score(Xtest, ytest), sep="\n")

scoresdt = cross_val_score(pipe, Xtrain, ytrain, cv=10)
print(scoresdt)
print("Average R2", np.mean(scoresdt), sep="\n")
Training R2 0.9767176875905771 Testing R2 0.8778558264076294 [0.87305049 0.91500011 0.8767729 0.50537323 0.89880623 0.81334744 0.8032829 0.89379128 0.80434132 0.83197514] Average R2 0.8215741045745162
#Will regularisation help ?
# PowerTransformer + Lasso (alpha=1): shrinks coefficients, scores slightly
# below the unregularised power-transformed linear fit here.
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import Lasso, Ridge

X = demo.drop('csMPa', axis=1)
y = demo[['csMPa']]
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.20, random_state=10)

pipe = Pipeline([
    ("pt", PowerTransformer()),
    ("lr", Lasso(alpha=1)),
])
pipe.fit(Xtrain, ytrain)
print("Training R2", pipe.score(Xtrain, ytrain), sep="\n")
print("Testing R2", pipe.score(Xtest, ytest), sep="\n")

scoresdt = cross_val_score(pipe, Xtrain, ytrain, cv=10)
print(scoresdt)
print("Average R2", np.mean(scoresdt), sep="\n")
Training R2 0.7846330811838018 Testing R2 0.786292589828556 [0.80604756 0.79003713 0.77491562 0.67917305 0.74901259 0.75473434 0.7595385 0.84837498 0.79395608 0.78396381] Average R2 0.7739753659690156
import warnings
warnings.filterwarnings('ignore')
# Power transform + degree-2 polynomial features + linear regression.
from sklearn.linear_model import Lasso, Ridge

X = demo.drop('csMPa', axis=1)
y = demo[['csMPa']]
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.20, random_state=10)

pipe = Pipeline([
    ("pt", PowerTransformer()),
    ("poly", PolynomialFeatures(degree=2)),
    ("lr", LinearRegression())
])
pipe.fit(Xtrain, ytrain)
print("Training R2", pipe.score(Xtrain, ytrain), sep="\n")
print("Testing R2", pipe.score(Xtest, ytest), sep="\n")

scoresdt = cross_val_score(pipe, Xtrain, ytrain, cv=10)
print(scoresdt)
print("Average R2", np.mean(scoresdt), sep="\n")
Training R2 0.8862102587272438 Testing R2 0.8693185642957082 [0.85955564 0.88585786 0.86074545 0.7434244 0.84908628 0.86873768 0.85226966 0.93265428 0.8801613 0.84936551] Average R2 0.858185806795172
# Power transform + degree-3 polynomial, then RFE keeps the 70 best features
# before the final linear fit.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import RFE

X = demo.drop('csMPa', axis=1)
y = demo[['csMPa']]
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.20, random_state=10)

pipe = Pipeline([
    ("pt", PowerTransformer()),
    ("poly", PolynomialFeatures(degree=3)),
    ("rfe", RFE(estimator=LinearRegression(), n_features_to_select=70)),
    ("lr", LinearRegression()),
])
pipe.fit(Xtrain, ytrain)
print("Training R2", pipe.score(Xtrain, ytrain), sep="\n")
print("Testing R2", pipe.score(Xtest, ytest), sep="\n")

scoresdt = cross_val_score(pipe, Xtrain, ytrain, cv=10)
print(scoresdt)
print("Average R2", np.mean(scoresdt), sep="\n")
Training R2 0.9290736283119139 Testing R2 0.9139744254138868 [0.8791217 0.8622758 0.87701059 0.81614708 0.89297175 0.92354062 0.78626263 0.94093591 0.78263827 0.80604357] Average R2 0.8566947906440096
# Power transform + degree-3 polynomial, reduced to 100 principal components
# before the linear fit; also prints the PCA variance ratios.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA

X = demo.drop('csMPa', axis=1)
y = demo[['csMPa']]
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.20, random_state=10)

pipe = Pipeline([
    ("pt", PowerTransformer()),
    ("poly", PolynomialFeatures(degree=3)),
    ("pca", PCA(n_components=100)),
    ("lr", LinearRegression()),
])
pipe.fit(Xtrain, ytrain)
print("Training R2", pipe.score(Xtrain, ytrain), sep="\n")
print("Testing R2", pipe.score(Xtest, ytest), sep="\n")

scoresdt = cross_val_score(pipe, Xtrain, ytrain, cv=10)
print(scoresdt)
print("Average R2", np.mean(scoresdt), sep="\n")
print()
# Fraction of variance captured by each retained principal component.
print(pipe['pca'].explained_variance_ratio_)
Training R2 0.9128605223026308 Testing R2 0.8953976488385932 [0.88667223 0.88397477 0.8503153 0.72943138 0.91101421 0.87227558 0.91686935 0.92904147 0.85879424 0.82294363] Average R2 0.8661332166325666 [2.15280592e-01 1.09183333e-01 1.01954231e-01 5.92864375e-02 5.55839677e-02 4.16287563e-02 2.98326803e-02 2.49288126e-02 2.28337896e-02 2.03360687e-02 1.81909377e-02 1.66681798e-02 1.57906802e-02 1.53152999e-02 1.37357094e-02 1.26471511e-02 1.17074910e-02 1.11183245e-02 9.87953078e-03 9.48813929e-03 9.27406870e-03 8.72082068e-03 8.20359178e-03 7.59015177e-03 6.90003720e-03 6.49984384e-03 6.05806962e-03 5.78582638e-03 5.60743248e-03 5.52983791e-03 5.24804478e-03 5.07042081e-03 4.76715671e-03 4.48506992e-03 4.23765717e-03 4.14396010e-03 3.88270902e-03 3.70491702e-03 3.58224192e-03 3.34361343e-03 3.30874147e-03 3.09712240e-03 3.04309560e-03 2.88541661e-03 2.75966284e-03 2.54243489e-03 2.53132283e-03 2.47911380e-03 2.33108193e-03 2.19089690e-03 2.11196836e-03 2.03325493e-03 1.97309762e-03 1.83628528e-03 1.76694493e-03 1.67943605e-03 1.64296392e-03 1.55294777e-03 1.49980054e-03 1.40422168e-03 1.39177797e-03 1.28857595e-03 1.27008083e-03 1.20599232e-03 1.11521094e-03 1.05400121e-03 9.80483868e-04 9.46797131e-04 9.22245809e-04 8.86543603e-04 8.47106286e-04 8.35235402e-04 7.99182206e-04 6.95189457e-04 6.91195652e-04 6.63171472e-04 5.90165487e-04 5.70462064e-04 5.58477699e-04 5.33582023e-04 5.08165348e-04 4.99911323e-04 4.70900950e-04 4.36056463e-04 4.11947316e-04 3.92012526e-04 3.80752860e-04 3.68579751e-04 3.38304555e-04 3.30338203e-04 3.00108931e-04 2.81882943e-04 2.72294948e-04 2.56397909e-04 2.45624234e-04 2.28448256e-04 2.22242670e-04 2.19668812e-04 2.02234031e-04 2.00261072e-04]
# Deeper decision tree (max_depth=20). NOTE: this cell splits with
# random_state=20, unlike the other cells which use random_state=10.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score

X = demo.drop('csMPa', axis=1)
y = demo[['csMPa']]
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.20, random_state=20)

pipe = Pipeline([
    #("pt",PowerTransformer()),
    ("lr", DecisionTreeRegressor(max_depth=20)),
])
pipe.fit(Xtrain, ytrain)
print("Training R2", pipe.score(Xtrain, ytrain), sep="\n")
print("Testing R2", pipe.score(Xtest, ytest), sep="\n")

scoresdt = cross_val_score(pipe, Xtrain, ytrain, cv=10)
print(scoresdt)
print("Average R2", np.mean(scoresdt), sep="\n")
Training R2 0.9968823376499383 Testing R2 0.7908009133940821 [0.71072672 0.81772234 0.74816022 0.7824468 0.8825896 0.87924974 0.85906775 0.8749115 0.87349404 0.84127711] Average R2 0.8269645814531321
# Random forest with 200 trees — bagging narrows the train/test gap relative
# to a single decision tree.
import warnings
warnings.filterwarnings('ignore')
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

X = demo.drop('csMPa', axis=1)
y = demo[['csMPa']]
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.20, random_state=10)

pipe = Pipeline([
    ("lr", RandomForestRegressor(n_estimators=200, random_state=10)),
])
pipe.fit(Xtrain, ytrain)
print("Training R2", pipe.score(Xtrain, ytrain), sep="\n")
print("Testing R2", pipe.score(Xtest, ytest), sep="\n")

scoresdt = cross_val_score(pipe, Xtrain, ytrain, cv=10)
print(scoresdt)
print("Average R2", np.mean(scoresdt), sep="\n")
Training R2 0.9845267622955424 Testing R2 0.9220530902702947 [0.9020328 0.9296384 0.91840761 0.73954917 0.91616853 0.89726903 0.90132621 0.92512011 0.90951474 0.9039966 ] Average R2 0.8943023205600831
# Gradient boosting with 1500 stages — the strongest test R2 in this notebook.
import warnings
warnings.filterwarnings('ignore')
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

X = demo.drop('csMPa', axis=1)
y = demo[['csMPa']]
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.20, random_state=10)

pipe = Pipeline([
    #("pt",PowerTransformer()),
    ("lr", GradientBoostingRegressor(n_estimators=1500)),
])
pipe.fit(Xtrain, ytrain)
print("Training R2", pipe.score(Xtrain, ytrain), sep="\n")
print("Testing R2", pipe.score(Xtest, ytest), sep="\n")

scoresdt = cross_val_score(pipe, Xtrain, ytrain, cv=10)
print(scoresdt)
print("Average R2", np.mean(scoresdt), sep="\n")
Training R2 0.9927791355401739 Testing R2 0.9430031487340271 [0.93275073 0.94844072 0.93817591 0.79124773 0.93032166 0.94568298 0.94856183 0.96173984 0.92229248 0.89737315] Average R2 0.9216587028168263
# The individual boosting stages: an array of depth-3 regression trees.
pipe['lr'].estimators_
array([[DecisionTreeRegressor(criterion='friedman_mse', max_depth=3,
random_state=RandomState(MT19937) at 0x221B322DE40)],
[DecisionTreeRegressor(criterion='friedman_mse', max_depth=3,
random_state=RandomState(MT19937) at 0x221B322DE40)],
[DecisionTreeRegressor(criterion='friedman_mse', max_depth=3,
random_state=RandomState(MT19937) at 0x221B322DE40)],
...,
[DecisionTreeRegressor(criterion='friedman_mse', max_depth=3,
random_state=RandomState(MT19937) at 0x221B322DE40)],
[DecisionTreeRegressor(criterion='friedman_mse', max_depth=3,
random_state=RandomState(MT19937) at 0x221B322DE40)],
[DecisionTreeRegressor(criterion='friedman_mse', max_depth=3,
random_state=RandomState(MT19937) at 0x221B322DE40)]],
dtype=object)
# Pull out the second boosting stage's underlying regression tree.
tree1 = pipe['lr'].estimators_[1][0]
print(tree1)
DecisionTreeRegressor(criterion='friedman_mse', max_depth=3,
random_state=RandomState(MT19937) at 0x221B322DE40)
from sklearn.tree import export_graphviz
# Dump the tree in Graphviz .dot format. The shell escape below needs the
# Graphviz `dot` binary on PATH (it was missing when this notebook last ran).
export_graphviz(tree1,out_file = 'tree.dot',feature_names=Xtrain.columns)
!dot -Tpng tree.dot > tree.png # to convert the tree in a png image
'dot' is not recognized as an internal or external command, operable program or batch file.
from IPython.display import Image
# Display the rendered tree image (only works if the dot conversion succeeded).
Image(filename='tree.png')
#With Pipeline
import warnings
warnings.filterwarnings('ignore')
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
# AdaBoost over linear-regression base learners; the printed scores stay at
# the plain linear-regression level (~0.59 test R2).
# NOTE(review): `base_estimator` was renamed to `estimator` in scikit-learn
# 1.2 and removed in 1.4 — update this call when the environment is upgraded.
X=demo.drop('csMPa',axis=1)
y=demo[['csMPa']]
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size= .20,random_state=10)
pipe = Pipeline((
    #("pt",PowerTransformer()),
    ("lr", AdaBoostRegressor(n_estimators=10,base_estimator=LinearRegression(),random_state=10)),
))
pipe.fit(Xtrain,ytrain)
print("Training R2")
print(pipe.score(Xtrain,ytrain))
print("Testing R2")
print(pipe.score(Xtest,ytest))
scoresdt = cross_val_score(pipe,Xtrain,ytrain,cv=10)
print(scoresdt)
print("Average R2")
print(np.mean(scoresdt))
Training R2 0.6142957558574607 Testing R2 0.5869666439743113 [0.71313474 0.56300258 0.6046258 0.38858946 0.61876149 0.64043809 0.61862094 0.72828601 0.55533087 0.5825105 ] Average R2 0.6013300480615988
# Validation curve: training vs cross-validation R2 as tree depth grows,
# showing where a decision tree begins to over-fit.
print(__doc__)
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import validation_curve

param_range = np.arange(1, 50)
train_scores, test_scores = validation_curve(
    DecisionTreeRegressor(),
    Xtrain,
    ytrain,
    param_name="max_depth",
    param_range=param_range,
    n_jobs=1,
)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.title("Validation Curve with Decision Tree Regressor")
plt.xlabel("Depth")
plt.ylabel("Score")
plt.ylim(0.0, 1.1)
lw = 2
plt.semilogx(param_range, train_scores_mean, label="Training score",
             color="darkorange", lw=lw)
plt.fill_between(param_range, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std,
                 alpha=0.2, color="darkorange", lw=lw)
plt.semilogx(param_range, test_scores_mean, label="Cross-validation score",
             color="navy", lw=lw)
plt.fill_between(param_range, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std,
                 alpha=0.2, color="navy", lw=lw)
plt.legend(loc="best")
plt.show()
Automatically created module for IPython interactive environment
2022-08-20 12:27:13,190 - WARNING - findfont: Font family ['Heiti TC'] not found. Falling back to DejaVu Sans. 2022-08-20 12:27:13,408 - WARNING - findfont: Font family ['Heiti TC'] not found. Falling back to DejaVu Sans. 2022-08-20 12:27:13,431 - WARNING - findfont: Font family ['Heiti TC'] not found. Falling back to DejaVu Sans.
# Validation curve for gradient boosting: R2 as a function of the number of
# boosting stages (1 to 496 in steps of 5).
print(__doc__)
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import validation_curve

param_range = np.arange(1, 500, 5)
train_scores, test_scores = validation_curve(
    GradientBoostingRegressor(),
    Xtrain,
    ytrain,
    param_name="n_estimators",
    param_range=param_range,
    n_jobs=1,
)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.title("Validation Curve with Gradient Boosting Regressor")
plt.xlabel("Estimators")
plt.ylabel("Score")
plt.ylim(0.0, 1.1)
lw = 2
plt.semilogx(param_range, train_scores_mean, label="Training score",
             color="darkorange", lw=lw)
plt.fill_between(param_range, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std,
                 alpha=0.2, color="darkorange", lw=lw)
plt.semilogx(param_range, test_scores_mean, label="Cross-validation score",
             color="navy", lw=lw)
plt.fill_between(param_range, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std,
                 alpha=0.2, color="navy", lw=lw)
plt.legend(loc="best")
plt.show()
Automatically created module for IPython interactive environment
# Grid-search the polynomial degree and the RFE feature count for the
# power-transform + linear-regression pipeline (5-fold CV).
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV

pipe = Pipeline([
    ("pt", PowerTransformer()),
    ("poly", PolynomialFeatures()),
    ("rfe", RFE(estimator=LinearRegression())),
    ("lr", LinearRegression()),
])
param_grid = {
    'poly__degree': [1, 2, 3],
    'rfe__n_features_to_select': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
}
search = GridSearchCV(pipe, param_grid, cv=5)
search.fit(Xtrain, ytrain)
GridSearchCV(cv=5,
estimator=Pipeline(steps=(('pt', PowerTransformer()),
('poly', PolynomialFeatures()),
('rfe',
RFE(estimator=LinearRegression())),
('lr', LinearRegression()))),
param_grid={'poly__degree': [1, 2, 3],
'rfe__n_features_to_select': [10, 20, 30, 40, 50, 60,
70, 80, 90, 100]})In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=5,
estimator=Pipeline(steps=(('pt', PowerTransformer()),
('poly', PolynomialFeatures()),
('rfe',
RFE(estimator=LinearRegression())),
('lr', LinearRegression()))),
param_grid={'poly__degree': [1, 2, 3],
'rfe__n_features_to_select': [10, 20, 30, 40, 50, 60,
70, 80, 90, 100]})Pipeline(steps=(('pt', PowerTransformer()), ('poly', PolynomialFeatures()),
('rfe', RFE(estimator=LinearRegression())),
('lr', LinearRegression())))PowerTransformer()
PolynomialFeatures()
RFE(estimator=LinearRegression())
LinearRegression()
LinearRegression()
LinearRegression()
# Winning combination from the grid search.
search.best_params_
{'poly__degree': 2, 'rfe__n_features_to_select': 50}
#Performance on the CV set
# Mean 5-fold CV R2 of the best parameter combination.
search.best_score_
0.8613965128762686
#Performance on the test set
# R2 of the refitted best pipeline on the held-out test split.
search.score(Xtest,ytest)
0.8693185642957082
# Full grid-search log: fit/score times and per-split scores for all candidates.
search.cv_results_
{'mean_fit_time': array([0.04461918, 0.0339056 , 0.03150325, 0.04480567, 0.02681513,
0.02590356, 0.02030091, 0.02345381, 0.02593932, 0.03114247,
0.07655301, 0.05768166, 0.0478054 , 0.0320703 , 0.02593374,
0.02914829, 0.02824335, 0.03007202, 0.02570052, 0.02960224,
0.51362691, 0.48954029, 0.49656534, 0.4664752 , 0.45608735,
0.43631754, 0.41232214, 0.37949929, 0.35493908, 0.37396069]),
'std_fit_time': array([0.01089979, 0.00268909, 0.00515309, 0.01973224, 0.00779397,
0.01097073, 0.00714635, 0.00590211, 0.00671236, 0.01033104,
0.00583854, 0.00704249, 0.00103699, 0.00440527, 0.0064165 ,
0.00408259, 0.01070597, 0.00431428, 0.00725036, 0.00567829,
0.01945955, 0.0081033 , 0.01040959, 0.015616 , 0.01549348,
0.01381501, 0.01558777, 0.01520393, 0.01088077, 0.02517876]),
'mean_score_time': array([0.00631404, 0.0055809 , 0.00185161, 0.00784783, 0.00166488,
0.00493608, 0.01002626, 0.00131741, 0.00312543, 0.00070133,
0.00160265, 0.00996599, 0.00628057, 0.00333738, 0.00332584,
0.00312634, 0.00039902, 0.00150847, 0.00627413, 0. ,
0.00312562, 0.00312424, 0.00047407, 0. , 0.00164104,
0. , 0.00938382, 0.00312428, 0.00312638, 0.00313859]),
'std_score_time': array([0.00299319, 0.00072134, 0.0024191 , 0.00586939, 0.00332975,
0.0066389 , 0.00779606, 0.00263481, 0.00625086, 0.00140266,
0.0032053 , 0.00816775, 0.00769224, 0.00667477, 0.00616028,
0.00625267, 0.00079803, 0.00301695, 0.00768422, 0. ,
0.00625124, 0.00624847, 0.00094814, 0. , 0.00241133,
0. , 0.00766187, 0.00624857, 0.00625277, 0.00627718]),
'param_poly__degree': masked_array(data=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False],
fill_value='?',
dtype=object),
'param_rfe__n_features_to_select': masked_array(data=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 10, 20, 30,
40, 50, 60, 70, 80, 90, 100, 10, 20, 30, 40, 50, 60,
70, 80, 90, 100],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False],
fill_value='?',
dtype=object),
'params': [{'poly__degree': 1, 'rfe__n_features_to_select': 10},
{'poly__degree': 1, 'rfe__n_features_to_select': 20},
{'poly__degree': 1, 'rfe__n_features_to_select': 30},
{'poly__degree': 1, 'rfe__n_features_to_select': 40},
{'poly__degree': 1, 'rfe__n_features_to_select': 50},
{'poly__degree': 1, 'rfe__n_features_to_select': 60},
{'poly__degree': 1, 'rfe__n_features_to_select': 70},
{'poly__degree': 1, 'rfe__n_features_to_select': 80},
{'poly__degree': 1, 'rfe__n_features_to_select': 90},
{'poly__degree': 1, 'rfe__n_features_to_select': 100},
{'poly__degree': 2, 'rfe__n_features_to_select': 10},
{'poly__degree': 2, 'rfe__n_features_to_select': 20},
{'poly__degree': 2, 'rfe__n_features_to_select': 30},
{'poly__degree': 2, 'rfe__n_features_to_select': 40},
{'poly__degree': 2, 'rfe__n_features_to_select': 50},
{'poly__degree': 2, 'rfe__n_features_to_select': 60},
{'poly__degree': 2, 'rfe__n_features_to_select': 70},
{'poly__degree': 2, 'rfe__n_features_to_select': 80},
{'poly__degree': 2, 'rfe__n_features_to_select': 90},
{'poly__degree': 2, 'rfe__n_features_to_select': 100},
{'poly__degree': 3, 'rfe__n_features_to_select': 10},
{'poly__degree': 3, 'rfe__n_features_to_select': 20},
{'poly__degree': 3, 'rfe__n_features_to_select': 30},
{'poly__degree': 3, 'rfe__n_features_to_select': 40},
{'poly__degree': 3, 'rfe__n_features_to_select': 50},
{'poly__degree': 3, 'rfe__n_features_to_select': 60},
{'poly__degree': 3, 'rfe__n_features_to_select': 70},
{'poly__degree': 3, 'rfe__n_features_to_select': 80},
{'poly__degree': 3, 'rfe__n_features_to_select': 90},
{'poly__degree': 3, 'rfe__n_features_to_select': 100}],
'split0_test_score': array([0.82800264, 0.82800264, 0.82800264, 0.82800264, 0.82800264,
0.82800264, 0.82800264, 0.82800264, 0.82800264, 0.82800264,
0.85818012, 0.85814299, 0.8751552 , 0.87791802, 0.87788563,
0.87788563, 0.87788563, 0.87788563, 0.87788563, 0.87788563,
0.83149452, 0.84446108, 0.85816308, 0.86270717, 0.86508733,
0.87072702, 0.85651566, 0.87605614, 0.86316668, 0.86818852]),
'split1_test_score': array([0.73997504, 0.73997504, 0.73997504, 0.73997504, 0.73997504,
0.73997504, 0.73997504, 0.73997504, 0.73997504, 0.73997504,
0.78873244, 0.81944162, 0.82040054, 0.81960746, 0.81888645,
0.81888645, 0.81888645, 0.81888645, 0.81888645, 0.81888645,
0.78879427, 0.79771252, 0.79744578, 0.80448703, 0.82956387,
0.83431463, 0.82233401, 0.82116235, 0.80038817, 0.7908565 ]),
'split2_test_score': array([0.77737989, 0.77737989, 0.77737989, 0.77737989, 0.77737989,
0.77737989, 0.77737989, 0.77737989, 0.77737989, 0.77737989,
0.81704133, 0.83295255, 0.85906815, 0.85886851, 0.85868123,
0.85868123, 0.85868123, 0.85868123, 0.85868123, 0.85868123,
0.82158708, 0.8409183 , 0.83877771, 0.86551032, 0.88010262,
0.88163695, 0.88153518, 0.86922739, 0.86311455, 0.84978102]),
'split3_test_score': array([0.84184551, 0.84184551, 0.84184551, 0.84184551, 0.84184551,
0.84184551, 0.84184551, 0.84184551, 0.84184551, 0.84184551,
0.87649446, 0.86978949, 0.88403831, 0.88734618, 0.88811018,
0.88811018, 0.88811018, 0.88811018, 0.88811018, 0.88811018,
0.86102133, 0.85877654, 0.8856453 , 0.87285211, 0.88062025,
0.89220362, 0.83715283, 0.68801361, 0.73523914, 0.76300051]),
'split4_test_score': array([0.81597028, 0.81597028, 0.81597028, 0.81597028, 0.81597028,
0.81597028, 0.81597028, 0.81597028, 0.81597028, 0.81597028,
0.82623701, 0.84456578, 0.86300308, 0.86262732, 0.86341907,
0.86341907, 0.86341907, 0.86341907, 0.86341907, 0.86341907,
0.78808893, 0.8176141 , 0.83249249, 0.82314908, 0.81839289,
0.80752637, 0.72758725, 0.56550904, 0.596726 , 0.52483761]),
'mean_test_score': array([0.80063467, 0.80063467, 0.80063467, 0.80063467, 0.80063467,
0.80063467, 0.80063467, 0.80063467, 0.80063467, 0.80063467,
0.83333707, 0.84497849, 0.86033306, 0.8612735 , 0.86139651,
0.86139651, 0.86139651, 0.86139651, 0.86139651, 0.86139651,
0.81819723, 0.83189651, 0.84250487, 0.84574114, 0.85475339,
0.85728172, 0.82502499, 0.76399371, 0.77172691, 0.75933283]),
'std_test_score': array([0.03715614, 0.03715614, 0.03715614, 0.03715614, 0.03715614,
0.03715614, 0.03715614, 0.03715614, 0.03715614, 0.03715614,
0.0309363 , 0.01780907, 0.02184019, 0.02324005, 0.02368155,
0.02368155, 0.02368155, 0.02368155, 0.02368155, 0.02368155,
0.02754362, 0.02160708, 0.02915729, 0.02692944, 0.02598101,
0.03161195, 0.05259944, 0.12005791, 0.09948977, 0.12330907]),
'rank_test_score': array([18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 14, 12, 8, 7, 1, 1, 1,
1, 1, 1, 17, 15, 13, 11, 10, 9, 16, 29, 28, 30])}
# Transformation-only pipeline (scale -> degree-1 poly -> power -> PCA);
# the linear model is cross-validated separately on the prepared arrays.
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

pipe = Pipeline([
    ("sc", StandardScaler()),
    ("poly", PolynomialFeatures(degree=1)),
    ("pt", PowerTransformer()),
    ("pca", PCA()),
])
preparedtrain = pipe.fit_transform(Xtrain)
preparedtest = pipe.transform(Xtest)

lr = LinearRegression()
scoresdt = cross_val_score(lr, preparedtrain, ytrain, cv=10)
print(scoresdt)
print("Average R2", np.mean(scoresdt), sep="\n")
print("SD of accuracy", np.std(scoresdt), sep="\n")
print("Coefficient of Variation", np.std(scoresdt) / np.mean(scoresdt), sep="\n")
[0.83755397 0.82837868 0.77253378 0.67948574 0.78215895 0.79980915 0.80446461 0.88400822 0.82202225 0.83543769] Average R2 0.804585303621326 SD of accuracy 0.05140033404946304 Coefficient of Variation 0.0638842566700104
# With Pipeline (transformation steps only; the model is evaluated separately)
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.decomposition import PCA

# Same preprocessing chain as the linear-regression cell above
pipe = Pipeline([
    ("sc", StandardScaler()),
    ("poly", PolynomialFeatures(degree=1)),
    ("pt", PowerTransformer()),
    ("pca", PCA()),
])
preparedtrain = pipe.fit_transform(Xtrain)
preparedtest = pipe.transform(Xtest)

# 10-fold cross-validated R2 of a decision tree on the prepared data
lr = DecisionTreeRegressor()
scoresdt = cross_val_score(lr, preparedtrain, ytrain, cv=10)
print(scoresdt)
for label, value in (
    ("Average R2", np.mean(scoresdt)),
    ("SD of accuracy", np.std(scoresdt)),
    ("Coefficient of Variation", np.std(scoresdt) / np.mean(scoresdt)),
):
    print(label)
    print(value)
[0.78183552 0.82942633 0.83501023 0.48679804 0.71999849 0.80943426 0.61871217 0.82377361 0.77232752 0.77668622] Average R2 0.745400238315989 SD of accuracy 0.10579920141680962 Coefficient of Variation 0.14193609819045883
import warnings
warnings.filterwarnings('ignore')
# With Pipeline (transformation steps only; the model is evaluated separately)
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.decomposition import PCA

# Same preprocessing chain as the cells above
pipe = Pipeline([
    ("sc", StandardScaler()),
    ("poly", PolynomialFeatures(degree=1)),
    ("pt", PowerTransformer()),
    ("pca", PCA()),
])
preparedtrain = pipe.fit_transform(Xtrain)
preparedtest = pipe.transform(Xtest)

# 10-fold cross-validated R2 of gradient boosting (1500 trees)
lr = GradientBoostingRegressor(n_estimators=1500)
scoresdt = cross_val_score(lr, preparedtrain, ytrain, cv=10)
print(scoresdt)
for label, value in (
    ("Average R2", np.mean(scoresdt)),
    ("SD of accuracy", np.std(scoresdt)),
    ("Coefficient of Variation", np.std(scoresdt) / np.mean(scoresdt)),
):
    print(label)
    print(value)
[0.92898333 0.93096148 0.89389379 0.72794884 0.87624 0.91358472 0.90332407 0.92659893 0.88896537 0.90630058] Average R2 0.8896801113869902 SD of accuracy 0.05656998293355115 Coefficient of Variation 0.06358463250949815
# Hyperparameter tuning with the full pipeline (transforms + model together),
# so every CV fold refits the transforms without leaking test-fold statistics.
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV

pipe = Pipeline([
    ("sc", StandardScaler()),
    ("poly", PolynomialFeatures()),
    ("pt", PowerTransformer()),
    ("pca", PCA()),
    ("xb", GradientBoostingRegressor()),
])

# "<step>__<param>" keys address parameters of individual pipeline steps
param_grid = {
    'poly__degree': [2, 3],
    'pca__n_components': [30, 40],
    'xb__n_estimators': [10, 20, 30, 40, 50],
}
search = GridSearchCV(pipe, param_grid, cv=5)
search.fit(Xtrain, ytrain)
GridSearchCV(cv=5,
estimator=Pipeline(steps=(('sc', StandardScaler()),
('poly', PolynomialFeatures()),
('pt', PowerTransformer()),
('pca', PCA()),
('xb', GradientBoostingRegressor()))),
param_grid={'pca__n_components': [30, 40], 'poly__degree': [2, 3],
'xb__n_estimators': [10, 20, 30, 40, 50]})In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=5,
estimator=Pipeline(steps=(('sc', StandardScaler()),
('poly', PolynomialFeatures()),
('pt', PowerTransformer()),
('pca', PCA()),
('xb', GradientBoostingRegressor()))),
param_grid={'pca__n_components': [30, 40], 'poly__degree': [2, 3],
'xb__n_estimators': [10, 20, 30, 40, 50]})Pipeline(steps=(('sc', StandardScaler()), ('poly', PolynomialFeatures()),
('pt', PowerTransformer()), ('pca', PCA()),
('xb', GradientBoostingRegressor())))StandardScaler()
PolynomialFeatures()
PowerTransformer()
PCA()
GradientBoostingRegressor()
# Best hyperparameter combination found by the grid search
search.best_params_
{'pca__n_components': 40, 'poly__degree': 3, 'xb__n_estimators': 50}
# Performance on the CV set: mean cross-validated score of the best candidate
search.best_score_
0.7693164032644957
# Performance on the test set, using the best estimator refit on all of Xtrain
search.score(Xtest,ytest)
0.8278281327603929
# Full per-candidate CV diagnostics (fit/score times, per-split scores, ranks)
search.cv_results_
{'mean_fit_time': array([0.17909555, 0.21754951, 0.25751715, 0.30564833, 0.38976583,
0.4412683 , 0.52639647, 0.53987942, 0.59593072, 0.639257 ,
0.18944979, 0.24179244, 0.30838809, 0.38838096, 0.43179712,
0.49912648, 0.53541188, 0.60887065, 0.66554699, 0.73784704]),
'std_fit_time': array([0.03489127, 0.0121825 , 0.01041793, 0.01750667, 0.05602843,
0.01915068, 0.01242176, 0.03206063, 0.03081858, 0.02626644,
0.01300329, 0.00890885, 0.01822489, 0.05700012, 0.00776192,
0.05898207, 0.03075006, 0.03156214, 0.03502238, 0.05471011]),
'mean_score_time': array([0.00493269, 0. , 0.00220132, 0.01030612, 0.01131053,
0.01201177, 0.01547151, 0.01016688, 0.01118731, 0.01401072,
0.00160074, 0.00869679, 0.00030093, 0.006847 , 0.00280099,
0.0163826 , 0.01450005, 0.01267681, 0.01298552, 0.01299448]),
'std_score_time': array([0.00625313, 0. , 0.00440264, 0.00764323, 0.00818828,
0.00621898, 0.00487088, 0.00582391, 0.00644895, 0.00621721,
0.00320148, 0.00796393, 0.00060186, 0.00609915, 0.00560198,
0.00062981, 0.00224848, 0.0063477 , 0.01074462, 0.00780066]),
'param_pca__n_components': masked_array(data=[30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 40, 40, 40, 40,
40, 40, 40, 40, 40, 40],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False],
fill_value='?',
dtype=object),
'param_poly__degree': masked_array(data=[2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 3, 3, 3,
3, 3],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False],
fill_value='?',
dtype=object),
'param_xb__n_estimators': masked_array(data=[10, 20, 30, 40, 50, 10, 20, 30, 40, 50, 10, 20, 30, 40,
50, 10, 20, 30, 40, 50],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False],
fill_value='?',
dtype=object),
'params': [{'pca__n_components': 30,
'poly__degree': 2,
'xb__n_estimators': 10},
{'pca__n_components': 30, 'poly__degree': 2, 'xb__n_estimators': 20},
{'pca__n_components': 30, 'poly__degree': 2, 'xb__n_estimators': 30},
{'pca__n_components': 30, 'poly__degree': 2, 'xb__n_estimators': 40},
{'pca__n_components': 30, 'poly__degree': 2, 'xb__n_estimators': 50},
{'pca__n_components': 30, 'poly__degree': 3, 'xb__n_estimators': 10},
{'pca__n_components': 30, 'poly__degree': 3, 'xb__n_estimators': 20},
{'pca__n_components': 30, 'poly__degree': 3, 'xb__n_estimators': 30},
{'pca__n_components': 30, 'poly__degree': 3, 'xb__n_estimators': 40},
{'pca__n_components': 30, 'poly__degree': 3, 'xb__n_estimators': 50},
{'pca__n_components': 40, 'poly__degree': 2, 'xb__n_estimators': 10},
{'pca__n_components': 40, 'poly__degree': 2, 'xb__n_estimators': 20},
{'pca__n_components': 40, 'poly__degree': 2, 'xb__n_estimators': 30},
{'pca__n_components': 40, 'poly__degree': 2, 'xb__n_estimators': 40},
{'pca__n_components': 40, 'poly__degree': 2, 'xb__n_estimators': 50},
{'pca__n_components': 40, 'poly__degree': 3, 'xb__n_estimators': 10},
{'pca__n_components': 40, 'poly__degree': 3, 'xb__n_estimators': 20},
{'pca__n_components': 40, 'poly__degree': 3, 'xb__n_estimators': 30},
{'pca__n_components': 40, 'poly__degree': 3, 'xb__n_estimators': 40},
{'pca__n_components': 40, 'poly__degree': 3, 'xb__n_estimators': 50}],
'split0_test_score': array([0.50189925, 0.66450935, 0.74455995, 0.78440057, 0.8044589 ,
0.52921319, 0.68618659, 0.74523857, 0.75028958, 0.78748738,
0.49828454, 0.68489611, 0.75807466, 0.7956503 , 0.81638112,
0.54726058, 0.70440042, 0.75610062, 0.77560619, 0.79720146]),
'split1_test_score': array([0.46140235, 0.59870633, 0.66269429, 0.70810989, 0.72407238,
0.49098678, 0.63296589, 0.69689592, 0.72278832, 0.74571573,
0.4621014 , 0.60326816, 0.66925781, 0.70744461, 0.7287972 ,
0.49945808, 0.642076 , 0.70345609, 0.72957757, 0.75145923]),
'split2_test_score': array([0.4393115 , 0.58879137, 0.66609398, 0.71330606, 0.74664221,
0.44674827, 0.61667915, 0.68263564, 0.70532518, 0.72243616,
0.42765952, 0.57806677, 0.65250774, 0.7041596 , 0.73046838,
0.44290836, 0.6191157 , 0.67829519, 0.73490798, 0.7450547 ]),
'split3_test_score': array([0.50465369, 0.68335976, 0.75102022, 0.77964832, 0.80266428,
0.55216449, 0.70394941, 0.7472826 , 0.78744672, 0.80337305,
0.49917547, 0.67222509, 0.73921531, 0.77928835, 0.79915444,
0.55441466, 0.7124003 , 0.76544746, 0.79969618, 0.8135224 ]),
'split4_test_score': array([0.42884327, 0.59098612, 0.67331079, 0.72501358, 0.75505316,
0.45254381, 0.60964324, 0.70056266, 0.7274162 , 0.75456088,
0.42884327, 0.58963172, 0.67481202, 0.72976296, 0.74993285,
0.44545587, 0.60794378, 0.68928653, 0.72787824, 0.73934423]),
'mean_test_score': array([0.46722201, 0.62527059, 0.69953585, 0.74209568, 0.76657819,
0.49433131, 0.64988486, 0.71452308, 0.7386532 , 0.76271464,
0.46321284, 0.62561757, 0.69877351, 0.74326116, 0.7649468 ,
0.49789951, 0.65718724, 0.71851718, 0.75353323, 0.7693164 ]),
'std_test_score': array([0.03127118, 0.0403134 , 0.03960107, 0.03309253, 0.03185645,
0.04143175, 0.03807637, 0.02660469, 0.02830171, 0.02913622,
0.03152737, 0.04414015, 0.04180434, 0.0375141 , 0.03615695,
0.04776457, 0.04331345, 0.03553601, 0.02897304, 0.03012503]),
'rank_test_score': array([19, 16, 11, 7, 2, 18, 14, 10, 8, 4, 20, 15, 12, 6, 3, 17, 13,
9, 5, 1])}
# Confidence interval of the cross-validated accuracy (R2) via a t-interval
import warnings
warnings.filterwarnings('ignore')
import scipy.stats as stats
# With Pipeline (full pipeline: transforms + model, so CV refits everything per fold)
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

pipe = Pipeline([
    ("sc", StandardScaler()),
    ("poly", PolynomialFeatures(degree=1)),
    ("pt", PowerTransformer()),
    ("pca", PCA()),
    ("lr", GradientBoostingRegressor()),
])
scoresdt = cross_val_score(pipe, Xtrain, ytrain, cv=10)
print(scoresdt)
print("Average R2")
print(np.mean(scoresdt))
print("SD of accuracy")
print(np.std(scoresdt))
# 95% confidence interval: xbar +/- t * s/sqrt(n).
# n and df are derived from the scores instead of being hard-coded (the
# original used n=10, df=9, which silently breaks if cv is changed).
xbar = np.mean(scoresdt)
n = len(scoresdt)
s = np.std(scoresdt, ddof=1)  # sample SD (ddof=1) as required by the t-interval
se = s / np.sqrt(n)
stats.t.interval(0.95, df=n - 1, loc=xbar, scale=se)
[0.90259011 0.89879954 0.82637242 0.75685962 0.86542843 0.88783753 0.85223095 0.90848942 0.86757857 0.87410687] Average R2 0.8640293440581294 SD of accuracy 0.04289304660925258
(0.831685739851828, 0.8963729482644307)
import warnings
warnings.filterwarnings('ignore')
# Model comparison: 10-fold CV of several regressors, then a box plot
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor,AdaBoostRegressor,RandomForestRegressor,VotingRegressor
from sklearn import model_selection

# NOTE(review): X and y are built here but the cross-validation below scores
# on Xtrain/ytrain — confirm whether X/y are needed later in the notebook.
X = demo.drop('csMPa', axis=1)
y = demo[['csMPa']]

lr = LinearRegression()
dt = DecisionTreeRegressor()
rf = RandomForestRegressor(n_estimators=1000)
gb = GradientBoostingRegressor(n_estimators=1500)
vr = VotingRegressor(estimators=[('lr', lr), ('dt', dt)])

models = [
    ('Linear Regression', lr),
    ('Decision Tree Regression', dt),
    ('Random Forest Regression', rf),
    ('Gradient Boosting Regression', gb),
    ('Voting Regressor', vr),
]

# Evaluate each model in turn; report median R2 and SD across the folds
results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=10)
    cv_results = model_selection.cross_val_score(model, Xtrain, ytrain, cv=kfold)
    results.append(cv_results)
    names.append(name)
    print("%s: %f (%f)" % (name, np.median(cv_results), cv_results.std()))

# Box plot comparing the per-fold score distributions of all models
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
Linear Regression: 0.607883 (0.101906) Decision Tree Regression: 0.860376 (0.091311) Random Forest Regression: 0.913167 (0.052689) Gradient Boosting Regression: 0.936840 (0.046315) Voting Regressor: 0.830142 (0.071303)
results
[array([0.71979944, 0.55030814, 0.62680426, 0.35784287, 0.60852661,
0.64955735, 0.60723933, 0.74942929, 0.55857665, 0.56769243]),
array([0.87043629, 0.92879617, 0.87057518, 0.58166291, 0.88051702,
0.83918381, 0.80282728, 0.89841906, 0.81221208, 0.85031483]),
array([0.9070357 , 0.93177667, 0.9158592 , 0.74051332, 0.91596982,
0.89758559, 0.90186321, 0.92340175, 0.91572206, 0.91061193]),
array([0.93576212, 0.94829376, 0.93791795, 0.79228121, 0.93031221,
0.94560607, 0.94867402, 0.96081009, 0.92258211, 0.8966009 ]),
array([0.88999557, 0.85855445, 0.82391818, 0.62100202, 0.83394366,
0.84491152, 0.80071862, 0.88007985, 0.81809311, 0.82634115])]
# Median and SD of the fold scores at results[2] (the Random Forest entry)
print(np.median(results[2]))
print(np.std(results[2]))
0.9131669965639206 0.05268894565870546
# Box plot of the per-fold scores at results[2] (the Random Forest entry)
sns.boxplot(results[2])
<AxesSubplot:>